We first need to load the packages we’ll use and set up a theme for our plots. The main workhorse for our networks will be tidygraph and ggraph. As this is a basic primer, you can find more info on these packages here:
library(tidyverse) # For data manipulation/graphing
library(tidygraph) # Represent network data in tabular format
library(cowplot) # Make ggplot prettier
library(ggraph) # Graph networks
theme_set(theme_cowplot(12)) # Minimalist theme
# Colorblind friendly palette
cbPalette <- c("#999999", #grey
"#E69F00", #orange
"#56B4E9", #sky blue
"#009E73", #bluish green
"#F0E442", #yellow
"#0072B2", #blue
"#D55E00", #vermillion
"#CC79A7") #reddish purple
Most network data can be represented as a matrix. Take for example the following data pertaining to expression of Vibrio cholerae virulence genes across 300 samples. Say we want to visualize correlation between genes/across samples. We will start wit raw data with normalized reads per transcript across experiments
# Raw data with normalized reads per transcript across experiments
virulence_tbl <- read_csv("data/VPI1_VPI2.csv") %>%
filter(!(Transcript%in%c(paste0("VC",c(1785:1900)))))
head(virulence_tbl)
## # A tibble: 6 x 301
## Transcript Tn_1 Tn_2 Tn_3 TntfoX.strep_1 TntfoX.strep_2 TntfoX.strep_3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 VC0070 8.88 9.36 8.76 8.90 8.9 8.69
## 2 VC0583 15.6 15.2 15.2 15.3 14.9 15.1
## 3 VC0817 8.86 9.61 8.99 8.99 9.16 8.67
## 4 VC0818 7.50 8.08 7.38 7.38 7.33 7.29
## 5 VC0819 7.52 7.85 7.47 7.64 7.61 7.31
## 6 VC0820 7.55 7.90 7.73 7.79 7.80 7.62
## # … with 294 more variables: TntfoY.strep_1 <dbl>, TntfoY.strep_2 <dbl>,
## # TnqstR_1 <dbl>, TnqstR_2 <dbl>, FRT.Tn_1 <dbl>, FRT.TntfoX.strep_1 <dbl>,
## # FRT.TntfoX.strep_2 <dbl>, FRT.TntfoX.strep_3 <dbl>,
## # FRT.TntfoY.strep_1 <dbl>, FRT.TntfoY.strep_2 <dbl>,
## # FRT.TntfoY.strep_3 <dbl>, FRT.TnqstR_1 <dbl>, FRT.TnqstR_2 <dbl>,
## # FRT.TnqstR_3 <dbl>, TntfoY.strep_3 <dbl>, FRT.Tn_2 <dbl>, TnqstR_3 <dbl>,
## # FRT.Tn_3 <dbl>, delta.qstR_TntfoX_1 <dbl>, delta.qstR_TntfoX_2 <dbl>,
## # delta.hapR._TntfoX_1 <dbl>, delta.hapR._TntfoX_2 <dbl>,
## # delta.qstR_TntfoX_3 <dbl>, delta.hapR._TntfoX_3 <dbl>, WT_TntfoX_1 <dbl>,
## # WT_TntfoX_2 <dbl>, WT_TntfoX_3 <dbl>, WT_TntfoX_NA_1 <dbl>,
## # WT_TntfoX_NA_2 <dbl>, WT_TntfoX_NA_3 <dbl>, WT_no_T_1 <dbl>,
## # WT_no_T_2 <dbl>, WT_no_T_3 <dbl>, TntfoY_1 <dbl>, TntfoY_2 <dbl>,
## # TntfoY_3 <dbl>, Empty_ctl_1 <dbl>, Empty_ctl_2 <dbl>, Empty_ctl_3 <dbl>,
## # vqmr_1 <dbl>, vqmr_2 <dbl>, vqmr_3 <dbl>, water_1 <dbl>, water_2 <dbl>,
## # water_3 <dbl>, AI_1 <dbl>, AI_2 <dbl>, AI_3 <dbl>, CAI_1 <dbl>,
## # CAI_2 <dbl>, CAI_3 <dbl>, DPO_1 <dbl>, DPO_2 <dbl>, DPO_3 <dbl>,
## # AI_CAI_1 <dbl>, AI_CAI_2 <dbl>, AI_DPO_1 <dbl>, AI_DPO_2 <dbl>,
## # AI_CAI_3 <dbl>, AI_DPO_3 <dbl>, CAI_DPO_1 <dbl>, CAI_DPO_2 <dbl>,
## # CAI_DPO_3 <dbl>, AI_CAI_DPO_1 <dbl>, AI_CAI_DPO_2 <dbl>,
## # AI_CAI_DPO_3 <dbl>, Ctl_1 <dbl>, Ctl_2 <dbl>, vchM_KO_1 <dbl>,
## # vchM_KO_2 <dbl>, QS.mutant_1 <dbl>, QS.mutant_TEX_1 <dbl>,
## # QS.mutant_2 <dbl>, QS.mutant_TEX_2 <dbl>, QS.mutant_3 <dbl>,
## # QS.mutant_TEX_3 <dbl>, QS.mutant_4 <dbl>, QS.mutant_TEX_4 <dbl>,
## # WT_C6706_1 <dbl>, WT_C6706_TEX_1 <dbl>, WT_C6706_2 <dbl>,
## # WT_C6706_TEX_2 <dbl>, WT_C6706_3 <dbl>, WT_C6706_TEX_3 <dbl>,
## # WT_C6706_4 <dbl>, WT_C6706_TEX_4 <dbl>, ToxT_KO_1 <dbl>, ToxT_KO_2 <dbl>,
## # ToxT_KO_3 <dbl>, ToxT_Rescue_1 <dbl>, ToxT_Rescue_2 <dbl>,
## # ToxT_Rescue_3 <dbl>, dncV_KO_1 <dbl>, dncV_KO_2 <dbl>, dncV_KO_3 <dbl>,
## # dncV_Rescue_1 <dbl>, dncV_Rescue_2 <dbl>, dncV_Rescue_3 <dbl>,
## # lacZ_1 <dbl>, lacZ_hns_1 <dbl>, …
Next we need to convert this into a matrix with only numbers
vir_matrix <- as.matrix(virulence_tbl %>% column_to_rownames("Transcript"))
vir_matrix[1:10,1:10]
## Tn_1 Tn_2 Tn_3 TntfoX.strep_1 TntfoX.strep_2 TntfoX.strep_3
## VC0070 8.878 9.365 8.756 8.895 8.900 8.686
## VC0583 15.573 15.231 15.178 15.297 14.887 15.092
## VC0817 8.856 9.612 8.988 8.991 9.158 8.667
## VC0818 7.499 8.083 7.375 7.381 7.334 7.286
## VC0819 7.518 7.847 7.471 7.642 7.609 7.312
## VC0820 7.551 7.898 7.726 7.789 7.796 7.625
## VC0821 6.950 7.160 7.060 7.067 6.787 6.951
## VC0822 8.444 8.727 8.546 8.735 8.647 8.445
## VC0823 8.814 9.230 8.799 9.271 9.105 9.014
## VC0824 8.554 8.412 8.516 8.813 8.675 8.918
## TntfoY.strep_1 TntfoY.strep_2 TnqstR_1 TnqstR_2
## VC0070 8.624 8.811 8.807 9.150
## VC0583 15.196 14.897 15.415 15.234
## VC0817 8.564 9.001 8.797 9.389
## VC0818 7.323 7.268 7.333 7.637
## VC0819 7.425 7.568 7.347 7.906
## VC0820 7.546 7.801 7.561 8.029
## VC0821 6.620 6.832 6.854 7.142
## VC0822 8.967 9.059 8.572 8.837
## VC0823 9.187 9.444 8.666 9.291
## VC0824 9.069 8.785 8.540 8.689
Our correlation function, cor(), calculates correlations between columns so we need to rotate our data using the t() (transpose) function.
vir_matrix <- t(vir_matrix)
vir_matrix[1:10,1:10]
## VC0070 VC0583 VC0817 VC0818 VC0819 VC0820 VC0821 VC0822 VC0823
## Tn_1 8.878 15.573 8.856 7.499 7.518 7.551 6.950 8.444 8.814
## Tn_2 9.365 15.231 9.612 8.083 7.847 7.898 7.160 8.727 9.230
## Tn_3 8.756 15.178 8.988 7.375 7.471 7.726 7.060 8.546 8.799
## TntfoX.strep_1 8.895 15.297 8.991 7.381 7.642 7.789 7.067 8.735 9.271
## TntfoX.strep_2 8.900 14.887 9.158 7.334 7.609 7.796 6.787 8.647 9.105
## TntfoX.strep_3 8.686 15.092 8.667 7.286 7.312 7.625 6.951 8.445 9.014
## TntfoY.strep_1 8.624 15.196 8.564 7.323 7.425 7.546 6.620 8.967 9.187
## TntfoY.strep_2 8.811 14.897 9.001 7.268 7.568 7.801 6.832 9.059 9.444
## TnqstR_1 8.807 15.415 8.797 7.333 7.347 7.561 6.854 8.572 8.666
## TnqstR_2 9.150 15.234 9.389 7.637 7.906 8.029 7.142 8.837 9.291
## VC0824
## Tn_1 8.554
## Tn_2 8.412
## Tn_3 8.516
## TntfoX.strep_1 8.813
## TntfoX.strep_2 8.675
## TntfoX.strep_3 8.918
## TntfoY.strep_1 9.069
## TntfoY.strep_2 8.785
## TnqstR_1 8.540
## TnqstR_2 8.689
Now we run cor and figure out and get a beautiful correlation matrix
corr_matrix <- cor(vir_matrix)
corr_matrix[1:10,1:10]
## VC0070 VC0583 VC0817 VC0818 VC0819 VC0820
## VC0070 1.00000000 0.16195291 0.2923376 0.30848546 -0.05990451 -0.1419211
## VC0583 0.16195291 1.00000000 0.4698289 0.05497381 -0.09315693 -0.2112110
## VC0817 0.29233762 0.46982891 1.0000000 0.72052321 0.47087945 0.3783796
## VC0818 0.30848546 0.05497381 0.7205232 1.00000000 0.64973191 0.6223768
## VC0819 -0.05990451 -0.09315693 0.4708795 0.64973191 1.00000000 0.9508117
## VC0820 -0.14192109 -0.21121095 0.3783796 0.62237685 0.95081168 1.0000000
## VC0821 -0.18789479 -0.23536283 0.2802374 0.53283252 0.90239355 0.9491916
## VC0822 -0.15024823 -0.19473395 0.3724130 0.58755926 0.90157928 0.9592002
## VC0823 -0.15392651 -0.03353641 0.4831163 0.66704341 0.81914945 0.8889545
## VC0824 -0.15376359 -0.10678858 0.1112739 0.19691159 0.67498754 0.7047740
## VC0821 VC0822 VC0823 VC0824
## VC0070 -0.1878948 -0.1502482 -0.15392651 -0.1537636
## VC0583 -0.2353628 -0.1947339 -0.03353641 -0.1067886
## VC0817 0.2802374 0.3724130 0.48311632 0.1112739
## VC0818 0.5328325 0.5875593 0.66704341 0.1969116
## VC0819 0.9023936 0.9015793 0.81914945 0.6749875
## VC0820 0.9491916 0.9592002 0.88895446 0.7047740
## VC0821 1.0000000 0.9637688 0.89100152 0.7748621
## VC0822 0.9637688 1.0000000 0.92578074 0.7781170
## VC0823 0.8910015 0.9257807 1.00000000 0.6371455
## VC0824 0.7748621 0.7781170 0.63714551 1.0000000
After converting this matrix into a tidy format we can visualize our results in a heatmap using geom_tile
long_tbl <- corr_matrix %>%
as_tibble(rownames = NA) %>%
rownames_to_column(var = "Transcript1") %>%
pivot_longer(cols = -Transcript1, names_to = "Transcript2",
values_to = "Correlation") %>%
mutate(Transcript1 = as.character(Transcript1))
ggplot(long_tbl, aes(x = Transcript1,
y = Transcript2,
fill = Correlation)) +
geom_tile() +
scale_x_discrete(breaks = c(virulence_tbl$Transcript))+
scale_fill_gradient2(low = "red",
high = "blue",
limits = c(-1,1)) +
theme(axis.text.x = element_text(angle = 45,
hjust = 1))
Man is this busy/ugly! This data would look much better if we could somehow filter to show only interesting connections and avoid redundant info. How do we do that?
We can make a network using tidygraph and ggraph! Tidygraph is an extension of existing table manipulation tidyverse packages (e.g. tidyr, dplyr) that can represent network/graph type data. Ggraph extends ggplot’s grammar/syntax style to visualizing network/graph data.
Let’s start by converting our previous correlation matrix into a tbl_graph object
vir_graph <- as_tbl_graph(corr_matrix)
vir_graph
## # A tbl_graph: 61 nodes and 3721 edges
## #
## # A directed multigraph with 1 component
## #
## # Node Data: 61 x 1 (active)
## name
## <chr>
## 1 VC0070
## 2 VC0583
## 3 VC0817
## 4 VC0818
## 5 VC0819
## 6 VC0820
## # … with 55 more rows
## #
## # Edge Data: 3,721 x 3
## from to weight
## <int> <int> <dbl>
## 1 1 1 1
## 2 1 2 0.162
## 3 1 3 0.292
## # … with 3,718 more rows
As you can see, our tbl_graph is really just two tibbles now, one containing all of our node information and the other containing all the edge data. You can manipulate these tibbles using standard tidyverse functions (e.g. left_join, mutate, filter, select). We will give some examples shortly but let’s first see what this data looks like when we plot it using ggraph. Ggraph is based off of the syntax of ggplot and every network plot will require a ggraph call to specify data you want to graph as well as calls to a node and edge geom.
ggraph(vir_graph) + # ggraph call
geom_node_point() + # node geom
geom_edge_link() # edge geom
That took a while and is exceptionally ugly! Let’s apply a correlation cutoff. We’re only interested in genes with a correlation of > 0.5 or < -0.1 so let’s filter and replot that. To specify which tibble we want to filter on (either edges or nodes) we use the activate function
vir_graph_filtered <- vir_graph %>%
activate(edges) %>%
filter(weight >= 0.5 | weight < -0.1)
ggraph(vir_graph_filtered) +
geom_node_point() +
geom_edge_link()
Looking better, now let’s tweak things a bit so that we know what genes we’re using and the edges aren’t so overwhelming. Most ggraph geoms are analogous to existing ggplot geoms and take similar parameters. Here, we use an alpha value to control how transparent the edges are and use geom_node_text to label our nodes.
ggraph(vir_graph_filtered) +
geom_edge_link(alpha = 0.1) +
geom_node_point() +
geom_node_text(aes(label = name)) # Note that you can use the repel=TRUE option force labels to avoid overlapping
We just went over the basics for tidygraph and ggraph, now let’s dig into the weeds a bit. Here we want to add some color to our graph so that we can quickly figure out some info about our nodes and edges. Let’s color nodes by location on the genome (VPI1, VPI2, or other) and color edges by the correlation between genes
vpi2 <- paste0("VC", c(1757:1810))
vpi1 <- paste0("VC0", c(817:847))
vir_graph_groups <-
vir_graph_filtered %>%
activate(nodes) %>%
mutate(group =
case_when(name %in% vpi1 ~ "VPI1",
name %in% vpi2 ~ "VPI2",
TRUE ~ "Other"))
ggraph(vir_graph_groups) +
geom_edge_link(alpha = 0.25, aes(colour = weight)) +
scale_edge_color_gradient2(low = "red",
mid = "grey50",
high = "blue",
limits = c(-1,1)) +
geom_node_point(aes(colour = group)) +
scale_colour_manual(values = cbPalette)
The edge colors were a little faint and the nodes are a little small. call we improve this? Let’s use binary coloring for the edges (positive/negative instead of the numerical gradient) and increase the size of the nodes.
vir_graph_colours <-
vir_graph_groups %>%
activate(edges) %>%
mutate(corr_dir= if_else(weight > 0,
"positive",
"negative"))
ggraph(vir_graph_colours) +
geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = group), size = 4) +
scale_colour_manual(values = cbPalette)
Now let’s label all of the genes in the “Other” group and change the shape of the nodes based on their grouping.
vir_graph_shapes_labels <-
vir_graph_colours %>%
activate(nodes) %>%
mutate(name_label = if_else(group == "Other",
name,
NA_character_))
ggraph(vir_graph_shapes_labels) +
geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = group,
shape = group), size = 4) +
scale_colour_manual(values = cbPalette) +
geom_node_text(aes(label = name_label))
You can also facet based on nodes or edges. This can be great if you want to show how the same network changes over time/different conditions.
ggraph(vir_graph_shapes_labels) +
geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = group,
shape = group), size = 4) +
scale_colour_manual(values = cbPalette) +
geom_node_text(aes(label = name_label)) +
facet_nodes(~group)
ggraph(vir_graph_shapes_labels) +
geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = group,
shape = group), size = 4) +
scale_colour_manual(values = cbPalette) +
geom_node_text(aes(label = name_label)) +
facet_edges(~corr_dir)
You may want to characterize certain aspects of your network such as node centrality (i.e. connectedness) or cluster your nodes based on an established algorithm/network topology. Tidygraph is loaded with a ton of convenient functions to do just that! Let’s run through a quick example that calculates the centrality of our nodes and clusters them.
vir_graph_cent_clust <-
vir_graph_shapes_labels %>%
activate(nodes) %>%
mutate(
central = centrality_closeness(),
infomap = as.factor(group_infomap()))
ggraph(vir_graph_cent_clust) +
geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = infomap,
shape = group,
size = central)) +
scale_colour_manual(values = cbPalette) +
geom_node_text(aes(label = name_label))
You can manually choose any layout supported by igraph (see here for some ideas: https://www.data-imaginist.com/2017/ggraph-introduction-layouts/). These can drastically change the appearance of your final plot. Here’s an example of the sample data visualized with 12 different layouts (“stress” is the default we’ve been using thus far).
layout_test <-
function(layout_choice,
graph_in){
plot_out <- ggraph(graph_in, layout = layout_choice) +
geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = group,
shape = group), size = 4) +
scale_colour_manual(values = cbPalette) +
ggtitle(layout_choice) +
geom_node_text(aes(label = name_label))
return(plot_out)
}
layouts <- c('star', 'circle', 'gem', 'dh', 'graphopt', 'grid', 'mds',
'randomly', 'fr', 'drl', 'lgl', 'stress')
vir_graph_simple <- vir_graph_shapes_labels %>%
activate(nodes) %>%
filter(group != "VPI2")
plot_list <- lapply(layouts,
layout_test,
graph_in = vir_graph_simple)
plot_grid(plotlist = plot_list,
ncol = 3)
You may also want to employ some more specialized layouts in combination with specific node or edge geoms. Ggraph provides a lot of functionality including:
ggraph(vir_graph_shapes_labels, layout = "linear") +
geom_edge_arc(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = group,
shape = group), size = 4) +
scale_colour_manual(values = cbPalette)
ggraph(vir_graph_shapes_labels, layout = "linear", circular = TRUE) +
geom_edge_arc(alpha = 0.1, aes(colour = corr_dir)) +
scale_edge_color_manual(values = c("positive" = "blue",
"negative" = "red"))+
geom_node_point(aes(colour = group,
shape = group), size = 4) +
scale_colour_manual(values = cbPalette)
This is just a fun example, there are much better packages to make these with, e.g. ggtree.
vir_dist <- as.dist(corr_matrix)
vir_clust <- hclust(vir_dist)
clust_graph <- as_tbl_graph(vir_clust) %>%
activate(nodes) %>%
mutate(group =
case_when(label %in% vpi1 ~ "VPI1",
label %in% vpi2 ~ "VPI2",
label == "" ~ NA_character_,
TRUE ~ "Other"))
ggraph(clust_graph, layout = "dendrogram") +
geom_edge_diagonal2(aes(colour = node.group)) +
# geom_node_point(, size = 3) +
geom_node_text(aes(label = label), angle = 45, nudge_y = -0.4, nudge_x = -0.6)
We’ve covered the basics but there are a lot of edge (and node!) cases that could come up. Here are some links and tips that may be helpful as you work on your own network visualizations: